# Housekeeping: clear the workspace, reclaim memory, and clear the console.
# NOTE(review): rm(list = ls()) wipes the global environment. That is the
# stated intent of this interactive analysis script, but avoid it in code
# that others will source().
rm(list = ls())
# Trigger garbage collection; the echoed table below is its memory report.
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 521553 27.9 1155494 61.8 NA 669314 35.8
## Vcells 961026 7.4 8388608 64.0 16384 1840016 14.1
# "\f" is a form feed, which clears the RStudio console.
cat("\f")
# Packages required for the analysis, with the reason each is loaded.
packages <- c(
  "readr",       # open csv
  "psych",       # quick summary stats for data exploration
  "stargazer",   # summary stats for sharing
  "tidyverse",   # data manipulation like selecting variables
  "corrplot",    # correlation plots
  "ggplot2",     # graphing
  "ggcorrplot",  # correlation plot
  "gridExtra",   # overlay plots
  "data.table",  # reshape for graphing
  "car",         # vif
  "prettydoc",   # html output
  "visdat",      # visualize missing variables
  "glmnet",      # lasso/ridge
  "caret",       # confusion matrix
  "MASS",        # stepAIC
  "plm",         # fixed effects demeaned regression
  "lmtest",      # test regression coefficients
  "fpp3",        # Forecasting: Principles & Practice supplement
  "tsibble",
  "tsibbledata",
  "lubridate",
  "forecast",
  "seasonal"
)
# Install any missing package, then attach each one. Iterating over the
# vector directly (instead of 1:length(packages)) avoids the classic
# pitfall where 1:length(x) yields c(1, 0) on an empty vector.
# The repo URL uses https rather than http.
for (pkg in packages) {
  if (!pkg %in% rownames(installed.packages())) {
    install.packages(pkg,
                     repos = "https://cran.rstudio.com/",
                     dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}
rm(packages)
Use filter() to find what days corresponded
to the peak closing price for each of the four stocks in
gafa_stock.
data(gafa_stock)
# For each stock symbol, keep the row(s) on which the closing price
# reached its maximum -- the peak-close day for each of the four stocks.
# Note: filter() would retain all tied rows, unlike slice_max(n = 1).
gafa_stock %>%
group_by(Symbol) %>%
filter(Close == max(Close))
## # A tsibble: 4 x 8 [!]
## # Key: Symbol [4]
## # Groups: Symbol [4]
## Symbol Date Open High Low Close Adj_Close Volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2018-10-03 230. 233. 230. 232. 230. 28654800
## 2 AMZN 2018-09-04 2026. 2050. 2013 2040. 2040. 5721100
## 3 FB 2018-07-25 216. 219. 214. 218. 218. 58954200
## 4 GOOG 2018-07-26 1251 1270. 1249. 1268. 1268. 2405600
Download tute1.csv from the book website, open it in Excel
(or some other spreadsheet application), and review its contents. You
should find four columns of information. Columns B through D each
contain a quarterly series, labelled Sales, AdBudget and GDP. Sales
contains the quarterly sales for a small company over the period
1981-2005. AdBudget is the advertising budget and GDP is the gross
domestic product. All series have been adjusted for inflation.
tute1 <- readr::read_csv("tute1.csv")
# Preview the first four rows to confirm the CSV parsed as expected.
head(tute1, 4)
## # A tibble: 4 × 4
## Quarter Sales AdBudget GDP
## <date> <dbl> <dbl> <dbl>
## 1 1981-03-01 1020. 659. 252.
## 2 1981-06-01 889. 589 291.
## 3 1981-09-01 795 512. 291.
## 4 1981-12-01 1004. 614. 292.
# Build a quarterly tsibble: parse the Quarter column into a yearquarter
# value and use it as the time index.
mytimeseries <- as_tsibble(
  mutate(tute1, Quarter = yearquarter(Quarter)),
  index = Quarter
)
# Reshape to long format (one row per quarter/series) and draw one facet
# per series, each with its own y scale so small series remain visible.
mytimeseries %>%
  pivot_longer(cols = -Quarter) %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(rows = vars(name), scales = "free_y")
The exercise prompts us to check what happens when facet_grid() is not
included.
# Same plot without facet_grid(): all three series share a single y axis,
# which compresses the lower-valued series and hides their variation.
mytimeseries %>%
pivot_longer(-Quarter) %>%
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line()
The facet_grid()
function allows each variable to be displayed on its own corresponding value scale (y axis) while keeping the x axis the same. It shows more detail in the variance of each variable.
The aus_livestock data contains the
monthly total number of pigs slaughtered in Victoria, Australia, from
Jul 1972 to Dec 2018. Use filter() to extract pig
slaughters in Victoria between 1990 and 1995. Use
autoplot() and ACF() for this data. How do
they differ from white noise? If a longer period of data is used, what
difference does it make to the ACF?
data(aus_livestock)
# NOTE(review): this result is not assigned -- the pipeline only prints the
# tsibble echoed below. aus_livestock appears to already be a monthly
# tsibble, so the mutate/as_tsibble here is presumably a no-op; confirm.
aus_livestock %>%
mutate(Month = yearmonth(Month)) %>%
as_tsibble(index = Month)
## # A tsibble: 29,364 x 4 [1M]
## # Key: Animal, State [54]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
## 7 1977 Jan Bulls, bullocks and steers Australian Capital Territory 1800
## 8 1977 Feb Bulls, bullocks and steers Australian Capital Territory 1900
## 9 1977 Mar Bulls, bullocks and steers Australian Capital Territory 2700
## 10 1977 Apr Bulls, bullocks and steers Australian Capital Territory 2300
## # … with 29,354 more rows
# Victorian pig slaughters, 1990-1995 inclusive.
victoria <- aus_livestock %>%
  filter(
    State == "Victoria",
    Animal == "Pigs",
    between(year(Month), 1990, 1995)
  )

# Autocorrelation function of the monthly counts.
victoria %>%
  ACF(Count) %>%
  autoplot()
The ACF shows substantial autocorrelation at many lags. If a longer time period is used:
# The same series over a much longer window (1980-2018) for comparison.
victoria2 <- aus_livestock %>%
  filter(
    State == "Victoria",
    Animal == "Pigs",
    between(year(Month), 1980, 2018)
  )

victoria2 %>%
  ACF(Count) %>%
  autoplot()
the autocorrelation remains and even appears stronger. This means this is not a white-noise series: there are clear seasonal trends at almost all lags.
Why is a Box-Cox transformation unhelpful for the canadian_gas data?
autoplot(canadian_gas, Volume)
Since the changes in variation over time are not consistent (not increasing over time, for example) the Box-Cox will be ineffective in standardizing the variation. Let’s plot a Box-Cox transform to show:
# Estimate the Box-Cox lambda for Volume using the Guerrero method.
lambda <- canadian_gas %>%
features(Volume, features = guerrero) %>%
pull(lambda_guerrero)
# Plot the Box-Cox-transformed series to check whether the variance
# has been stabilised (it has not -- see the note below).
canadian_gas %>%
autoplot(box_cox(Volume,lambda))
We see here that there is still inconsistent variance in the data.
Use the canadian_gas data
(monthly Canadian gas production in billions of cubic metres, January
1960 – February 2005). Plot the data using autoplot(),
gg_subseries() and gg_season() to look at the
effect of the changing seasonality over time.
canadian_gas %>%
autoplot(Volume)+
labs(title = "Monthly Canadian Gas Production")
# Subseries plot: one panel per calendar month, showing how each month's
# level and trend evolve across the years.
canadian_gas %>%
gg_subseries(Volume)+
labs(title = "Monthly Canadian Gas Production")
# Season plot: one line per year, overlaid across the months, to show
# how the seasonal shape changes over time.
canadian_gas %>%
gg_season(Volume)+
labs(title = "Monthly Canadian Gas Production")
# STL decomposition with a 21-month trend window and a fixed seasonal
# pattern (window = "periodic"); robust = TRUE downweights outliers.
canadian_gas %>%
model(
STL(Volume ~ trend(window = 21) + # 21-month trend window (an analyst choice, not required by monthly data)
season(window = "periodic"),
robust = TRUE)) %>%
components() %>%
autoplot()
Looking at the seasonal chart above, we see that over time demand still dips in the summer months but is less severe. Demand changes due to the weather and generally follows the same trend.
# Fit an STL with a short (7-month) trend window and a slowly-evolving
# seasonal component (window = 21), then plot the estimated seasonal
# component by year to see how the seasonal pattern changes over time.
canadian_gas %>%
model(`STL` = STL(Volume ~ trend(window = 7) + season(window = 21))) %>%
components() %>%
gg_season(season_year)
# Extract the seasonally adjusted series from the STL fit.
# Uses left assignment with `<-` instead of the original right
# assignment (`->`), per tidyverse style conventions.
canadian_gas_adjusted <- canadian_gas %>%
  model(`STL` = STL(Volume ~ trend(window = 7) + season(window = 21))) %>%
  components() %>%
  pull(season_adjust)
# Overlay the raw data, the seasonally adjusted series, and the trend
# from a robust STL fit (21-month trend window, 13-month seasonal window)
# on a single plot, with a manual colour legend.
canadian_gas %>%
model(
STL(Volume ~ trend(window = 21) +
season(window = 13),
robust = TRUE)) %>%
components() %>%
ggplot(aes(x = Month)) +
geom_line(aes(y = Volume, colour = "Data")) +
geom_line(aes(y = season_adjust,
colour = "Seasonally Adjusted")) +
geom_line(aes(y = trend, colour = "Trend")) +
labs(title = "STL decomposition of Canadian Gas Production") +
scale_colour_manual(
values = c("gray", "#0072B2", "#D55E00"),
breaks = c("Data", "Seasonally Adjusted", "Trend")
)
X-11
# X-11 decomposition via the X-13ARIMA-SEATS program (seasonal package).
x11_dcmp <- canadian_gas %>%
model(x11 = X_13ARIMA_SEATS(Volume ~ x11())) %>%
components()
# Plot the decomposed components (trend, seasonal, irregular).
autoplot(x11_dcmp) +
labs(title =
"Decomposition of total Canadian Gas using X-11.")
SEATS
# SEATS decomposition via the X-13ARIMA-SEATS program (seasonal package).
seats_dcmp <- canadian_gas %>%
  model(seats = X_13ARIMA_SEATS(Volume ~ seats())) %>%
  components()
# Plot the decomposed components. Title typo fixed: "GAs" -> "Gas".
autoplot(seats_dcmp) +
  labs(title = "Decomposition of Canadian Gas using SEATS")
The irregular series is lower with SEATS. Other trends look similar.